{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Logistic Regression\n",
    "\n",
    "Let's see an example of how to run a logistic regression with Python and Spark! This is documentation example, we will quickly run through this and then show a more realistic example, afterwards, you will have another consulting project!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder.appName('logregdoc').getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.classification import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load training data\n",
    "training = spark.read.format(\"libsvm\").load(\"sample_libsvm_data.txt\")\n",
    "\n",
    "lr = LogisticRegression()\n",
    "\n",
    "# Fit the model\n",
    "lrModel = lr.fit(training)\n",
    "\n",
    "trainingSummary = lrModel.summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+--------------------+--------------------+--------------------+----------+\n",
      "|label|            features|       rawPrediction|         probability|prediction|\n",
      "+-----+--------------------+--------------------+--------------------+----------+\n",
      "|  0.0|(692,[127,128,129...|[19.8534775947479...|[0.99999999761359...|       0.0|\n",
      "|  1.0|(692,[158,159,160...|[-20.377398194909...|[1.41321555110962...|       1.0|\n",
      "|  1.0|(692,[124,125,126...|[-27.401459284891...|[1.25804865127002...|       1.0|\n",
      "|  1.0|(692,[152,153,154...|[-18.862741612668...|[6.42710509170470...|       1.0|\n",
      "|  1.0|(692,[151,152,153...|[-20.483011833009...|[1.27157209200655...|       1.0|\n",
      "|  0.0|(692,[129,130,131...|[19.8506078990277...|[0.99999999760673...|       0.0|\n",
      "|  1.0|(692,[158,159,160...|[-20.337256674834...|[1.47109814695468...|       1.0|\n",
      "|  1.0|(692,[99,100,101,...|[-19.595579753418...|[3.08850168102550...|       1.0|\n",
      "|  0.0|(692,[154,155,156...|[19.2708803215615...|[0.99999999572670...|       0.0|\n",
      "|  0.0|(692,[127,128,129...|[23.6202328360424...|[0.99999999994480...|       0.0|\n",
      "|  1.0|(692,[154,155,156...|[-24.385235147660...|[2.56818872776620...|       1.0|\n",
      "|  0.0|(692,[153,154,155...|[26.3082522490181...|[0.99999999999624...|       0.0|\n",
      "|  0.0|(692,[151,152,153...|[25.8329060318707...|[0.99999999999396...|       0.0|\n",
      "|  1.0|(692,[129,130,131...|[-19.794609139087...|[2.53110684529387...|       1.0|\n",
      "|  0.0|(692,[154,155,156...|[21.0260440948067...|[0.99999999926123...|       0.0|\n",
      "|  1.0|(692,[150,151,152...|[-22.764979942873...|[1.29806018790960...|       1.0|\n",
      "|  0.0|(692,[124,125,126...|[21.5049307193955...|[0.99999999954235...|       0.0|\n",
      "|  0.0|(692,[152,153,154...|[31.9927184226426...|[0.99999999999998...|       0.0|\n",
      "|  1.0|(692,[97,98,99,12...|[-20.521067180413...|[1.22409115616575...|       1.0|\n",
      "|  1.0|(692,[124,125,126...|[-22.245377742755...|[2.18250475400430...|       1.0|\n",
      "+-----+--------------------+--------------------+--------------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "trainingSummary.predictions.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# May change soon!\n",
    "from pyspark.mllib.evaluation import MulticlassMetrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pyspark.ml.classification.BinaryLogisticRegressionSummary at 0x10cb57208>"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lrModel.evaluate(training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Usually would do this on a separate test set!\n",
    "predictionAndLabels = lrModel.evaluate(training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+--------------------+--------------------+--------------------+----------+\n",
      "|label|            features|       rawPrediction|         probability|prediction|\n",
      "+-----+--------------------+--------------------+--------------------+----------+\n",
      "|  0.0|(692,[127,128,129...|[19.8534775947479...|[0.99999999761359...|       0.0|\n",
      "|  1.0|(692,[158,159,160...|[-20.377398194909...|[1.41321555110962...|       1.0|\n",
      "|  1.0|(692,[124,125,126...|[-27.401459284891...|[1.25804865127002...|       1.0|\n",
      "|  1.0|(692,[152,153,154...|[-18.862741612668...|[6.42710509170470...|       1.0|\n",
      "|  1.0|(692,[151,152,153...|[-20.483011833009...|[1.27157209200655...|       1.0|\n",
      "|  0.0|(692,[129,130,131...|[19.8506078990277...|[0.99999999760673...|       0.0|\n",
      "|  1.0|(692,[158,159,160...|[-20.337256674834...|[1.47109814695468...|       1.0|\n",
      "|  1.0|(692,[99,100,101,...|[-19.595579753418...|[3.08850168102550...|       1.0|\n",
      "|  0.0|(692,[154,155,156...|[19.2708803215615...|[0.99999999572670...|       0.0|\n",
      "|  0.0|(692,[127,128,129...|[23.6202328360424...|[0.99999999994480...|       0.0|\n",
      "|  1.0|(692,[154,155,156...|[-24.385235147660...|[2.56818872776620...|       1.0|\n",
      "|  0.0|(692,[153,154,155...|[26.3082522490181...|[0.99999999999624...|       0.0|\n",
      "|  0.0|(692,[151,152,153...|[25.8329060318707...|[0.99999999999396...|       0.0|\n",
      "|  1.0|(692,[129,130,131...|[-19.794609139087...|[2.53110684529387...|       1.0|\n",
      "|  0.0|(692,[154,155,156...|[21.0260440948067...|[0.99999999926123...|       0.0|\n",
      "|  1.0|(692,[150,151,152...|[-22.764979942873...|[1.29806018790960...|       1.0|\n",
      "|  0.0|(692,[124,125,126...|[21.5049307193955...|[0.99999999954235...|       0.0|\n",
      "|  0.0|(692,[152,153,154...|[31.9927184226426...|[0.99999999999998...|       0.0|\n",
      "|  1.0|(692,[97,98,99,12...|[-20.521067180413...|[1.22409115616575...|       1.0|\n",
      "|  1.0|(692,[124,125,126...|[-22.245377742755...|[2.18250475400430...|       1.0|\n",
      "+-----+--------------------+--------------------+--------------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictionAndLabels.predictions.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "predictionAndLabels = predictionAndLabels.predictions.select('label','prediction')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+----------+\n",
      "|label|prediction|\n",
      "+-----+----------+\n",
      "|  0.0|       0.0|\n",
      "|  1.0|       1.0|\n",
      "|  1.0|       1.0|\n",
      "|  1.0|       1.0|\n",
      "|  1.0|       1.0|\n",
      "|  0.0|       0.0|\n",
      "|  1.0|       1.0|\n",
      "|  1.0|       1.0|\n",
      "|  0.0|       0.0|\n",
      "|  0.0|       0.0|\n",
      "|  1.0|       1.0|\n",
      "|  0.0|       0.0|\n",
      "|  0.0|       0.0|\n",
      "|  1.0|       1.0|\n",
      "|  0.0|       0.0|\n",
      "|  1.0|       1.0|\n",
      "|  0.0|       0.0|\n",
      "|  0.0|       0.0|\n",
      "|  1.0|       1.0|\n",
      "|  1.0|       1.0|\n",
      "+-----+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictionAndLabels.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluators\n",
    "\n",
    "Evaluators will be a very important part of our pipline when working with Machine Learning, let's see some basics for Logistic Regression, useful links:\n",
    "\n",
    "https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.BinaryClassificationEvaluator\n",
    "\n",
    "https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.MulticlassClassificationEvaluator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# For multiclass\n",
    "evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label',\n",
    "                                             metricName='accuracy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "acc = evaluator.evaluate(predictionAndLabels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "acc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Okay let's move on see some more examples!"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
